## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Question 1 - Linear regression with dummy variable [5 marks]

Consider the dataset “weatherhistory.csv”.

  1. Describe dependent variable pressure and independent variable temperature;
  2. Explore the relationship between pressure and temperature;
  3. Build a linear model considering temperature and pressure;
  4. Identify a dummy variable and build extended model considering dummy variable;
  5. Report your findings;

Answer

a) Describe dependent variable pressure and independent variable temperature.

Load the dataset

# Load the weather dataset and inspect its structure (96,452 hourly observations)
weatherHistory <- read.csv("weatherHistory.csv")
str(weatherHistory)
## 'data.frame':    96452 obs. of  13 variables:
##  $ X                  : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ time               : chr  "2006-04-01 01:00:00.000 +0200" "2006-04-01 02:00:00.000 +0200" "2006-04-01 03:00:00.000 +0200" "2006-04-01 04:00:00.000 +0200" ...
##  $ summary            : chr  "Partly Cloudy" "Mostly Cloudy" "Partly Cloudy" "Mostly Cloudy" ...
##  $ precipType         : chr  "rain" "rain" "rain" "rain" ...
##  $ temperature        : num  9.36 9.38 8.29 8.76 9.22 ...
##  $ apparentTemperature: num  7.23 9.38 5.94 6.98 7.11 ...
##  $ humidity           : num  0.86 0.89 0.83 0.83 0.85 0.95 0.89 0.82 0.72 0.67 ...
##  $ windSpeed          : num  14.26 3.93 14.1 11.04 13.96 ...
##  $ windBearing        : int  259 204 269 259 258 259 260 259 279 290 ...
##  $ visibility         : num  15.8 15 15.8 15.8 15 ...
##  $ loudCover          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pressure           : num  1016 1016 1016 1017 1017 ...
##  $ dailySummary       : chr  "Partly cloudy throughout the day." "Partly cloudy throughout the day." "Partly cloudy throughout the day." "Partly cloudy throughout the day." ...
head(weatherHistory)
##   X                          time       summary precipType temperature
## 1 2 2006-04-01 01:00:00.000 +0200 Partly Cloudy       rain    9.355556
## 2 3 2006-04-01 02:00:00.000 +0200 Mostly Cloudy       rain    9.377778
## 3 4 2006-04-01 03:00:00.000 +0200 Partly Cloudy       rain    8.288889
## 4 5 2006-04-01 04:00:00.000 +0200 Mostly Cloudy       rain    8.755556
## 5 6 2006-04-01 05:00:00.000 +0200 Partly Cloudy       rain    9.222222
## 6 7 2006-04-01 06:00:00.000 +0200 Partly Cloudy       rain    7.733333
##   apparentTemperature humidity windSpeed windBearing visibility loudCover
## 1            7.227778     0.86   14.2646         259    15.8263         0
## 2            9.377778     0.89    3.9284         204    14.9569         0
## 3            5.944444     0.83   14.1036         269    15.8263         0
## 4            6.977778     0.83   11.0446         259    15.8263         0
## 5            7.111111     0.85   13.9587         258    14.9569         0
## 6            5.522222     0.95   12.3648         259     9.9820         0
##   pressure                      dailySummary
## 1  1015.63 Partly cloudy throughout the day.
## 2  1015.94 Partly cloudy throughout the day.
## 3  1016.41 Partly cloudy throughout the day.
## 4  1016.51 Partly cloudy throughout the day.
## 5  1016.66 Partly cloudy throughout the day.
## 6  1016.72 Partly cloudy throughout the day.
colnames(weatherHistory)
##  [1] "X"                   "time"                "summary"            
##  [4] "precipType"          "temperature"         "apparentTemperature"
##  [7] "humidity"            "windSpeed"           "windBearing"        
## [10] "visibility"          "loudCover"           "pressure"           
## [13] "dailySummary"
# Boxplot of the independent variable (temperature) to inspect spread and outliers
ggplot(weatherHistory, aes(y = temperature)) +
  geom_boxplot(fill = "lightblue", colour = "darkblue") +
  labs(title = "Boxplot of Temperature", y = "Temperature") +
  theme_minimal()

# Boxplot of the dependent variable (pressure).
# Fix: title and axis label said "Temperature" (copy-paste error) even though
# this plot maps pressure to the y aesthetic.
ggplot(weatherHistory, aes(y = pressure)) +
  geom_boxplot(fill = "lightblue", colour = "darkblue") +
  labs(title = "Boxplot of Pressure", y = "Pressure") +
  theme_minimal()

# Histogram of temperature: roughly symmetric distribution
ggplot(weatherHistory, aes(x = temperature)) +
  geom_histogram(bins = 60, fill = "skyblue", color = "black") + # You can adjust the number of bins
  labs(title = "Histogram of Temperature", x = "Temperature (°C)", y = "Frequency") +
  theme_minimal()

# Histogram of pressure: strongly skewed because of the zero readings (min = 0)
ggplot(weatherHistory, aes(x = pressure)) +
  geom_histogram(bins = 200, fill = "lightgreen", color = "black") + # Adjust the number of bins as needed
  labs(title = "Histogram of Pressure", x = "Pressure (hPa)", y = "Frequency") +
  theme_minimal()

psych::describe(weatherHistory[, c("temperature", "pressure")])
##             vars     n    mean     sd  median trimmed   mad    min     max
## temperature    1 96452   11.93   9.55   12.00   11.79 10.40 -21.82   39.91
## pressure       2 96452 1003.24 116.97 1016.45 1016.54  6.81   0.00 1046.38
##               range  skew kurtosis   se
## temperature   61.73  0.09    -0.57 0.03
## pressure    1046.38 -8.42    69.26 0.38

Temperature has a mean of 11.93 and a standard deviation of 9.55, which shows sizeable fluctuations in temperature. The kurtosis of -0.57 indicates a distribution flatter than a normal distribution, and the standard error of the mean of 0.03 shows that the estimate is precise. We can conclude that temperature is a roughly symmetric variable with a flatter-than-normal distribution.

Pressure, on the other hand, shows skewed data with a lot of zero values. It has a kurtosis of 69.26, indicating thick tails, and a skew of -8.42, indicating the data is concentrated to the right of the mean.

Testing the temperature for normality

ggplot(weatherHistory, aes(sample=temperature)) + stat_qq()

ad.test(weatherHistory$temperature)
## 
##  Anderson-Darling normality test
## 
## data:  weatherHistory$temperature
## A = 202.38, p-value < 2.2e-16
ad.test(weatherHistory$pressure)
## 
##  Anderson-Darling normality test
## 
## data:  weatherHistory$pressure
## A = 31503, p-value < 2.2e-16

We can also see that neither temperature nor pressure follows a normal distribution.

b) Explore the relationship between pressure and temperature.

To explore the relationship between pressure and temperature.

# Scatterplot of temperature vs pressure with a fitted OLS line
ggplot(weatherHistory, aes(x = temperature, y = pressure)) +
  geom_point(alpha = 0.5) +  # using some transparency for points
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +  
  labs(title = "Temperature vs. Pressure",
       x = "Temperature",
       y = "Pressure") +
  theme_minimal()

It seems that the pressure outliers are affecting the relationship between temperature and pressure. Let’s remove them and see the correlation.

# Keep only valid observations: drop the zero-pressure outlier readings and any
# rows missing temperature or pressure. filter() joins multiple conditions with
# a logical AND, so this single call is equivalent to the two chained filters.
weatherHistoryWithPressure <- weatherHistory %>%
  filter(
    pressure != 0,
    !is.na(temperature),
    !is.na(pressure)
  )


ggplot(weatherHistoryWithPressure, aes(x = temperature, y = pressure)) +
  geom_point(alpha = 0.5) +  # using some transparency for points
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) + 
  labs(title = "Temperature vs. Pressure",
       x = "Temperature",
       y = "Pressure") +
  theme_minimal()

Let’s also do a Pearson’s correlation between the variables.

cor.test(weatherHistoryWithPressure$temperature, weatherHistoryWithPressure$pressure)
## 
##  Pearson's product-moment correlation
## 
## data:  weatherHistoryWithPressure$temperature and weatherHistoryWithPressure$pressure
## t = -100.75, df = 95162, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3161859 -0.3047036
## sample estimates:
##        cor 
## -0.3104561

The scatterplot suggests that we have a non-linear relationship between temperature and pressure, so a linear regression model will not fully capture or explain the relationship between them.

The correlation coefficient from Pearson’s test indicates a negative relationship of -0.31, meaning that as temperature increases, pressure tends to decrease.

The p-value is also very small (< 2.2e-16), suggesting the relationship is statistically significant.

c) Build a linear model considering temperature and pressure.

First let’s create the model.

# Simple linear regression: pressure as a function of temperature
lmTemperaturePressure <-  lm(pressure ~ temperature, 
                             data = weatherHistoryWithPressure)
# ANOVA table for the regression (F-test of the temperature effect)
anova(lmTemperaturePressure)
## Analysis of Variance Table
## 
## Response: pressure
##                Df  Sum Sq Mean Sq F value    Pr(>F)    
## temperature     1  554943  554943   10150 < 2.2e-16 ***
## Residuals   95162 5202744      55                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lmTemperaturePressure)
## 
## Call:
## lm(formula = pressure ~ temperature, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.223  -4.050   0.450   4.501  25.527 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.020e+03  3.840e-02 26557.4   <2e-16 ***
## temperature -2.530e-01  2.511e-03  -100.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.394 on 95162 degrees of freedom
## Multiple R-squared:  0.09638,    Adjusted R-squared:  0.09637 
## F-statistic: 1.015e+04 on 1 and 95162 DF,  p-value: < 2.2e-16

Standardize the coefficients to get the results expressed in standard deviations.

lm.beta(lmTemperaturePressure)  
## 
## Call:
## lm(formula = pressure ~ temperature, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
## (Intercept) temperature 
##          NA  -0.3104561
suppressWarnings(stargazer(lmTemperaturePressure, type="text"))
## 
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                               pressure           
## -------------------------------------------------
## temperature                   -0.253***          
##                                (0.003)           
##                                                  
## Constant                    1,019.837***         
##                                (0.038)           
##                                                  
## -------------------------------------------------
## Observations                   95,164            
## R2                              0.096            
## Adjusted R2                     0.096            
## Residual Std. Error      7.394 (df = 95162)      
## F Statistic         10,150.310*** (df = 1; 95162)
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01

So we got the following results:

  • For every 1 degree increase in temperature the pressure decreases 0.253 units.
  • The p-value is below 0.01, which gives us a 99% confidence level. Also, our F statistic is 10,150.310, suggesting that temperature is a good predictor of pressure.
  • R squared is 0.096 meaning that 9.6% of the pressure variance is explained by the temperature.
# Residuals vs observation index: look for patterns or non-constant variance
plot(residuals(lmTemperaturePressure), main = "Residuals of the Model",
     xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")

# Standard lm diagnostic plots (residuals vs fitted, Q-Q, scale-location, leverage)
plot(lmTemperaturePressure)

Looking at the residuals we can see a funneling effect — the residuals spread out as the fitted values increase, suggesting non-constant variance. We get bigger residuals for lower or higher pressures.

d) Identify a dummy variable and build extended model considering dummy variable.

Let’s create a categorical humidity scale variable and use it as a dummy variable for our model.

weatherHistoryDummy <- weatherHistory
# Bin humidity (a 0-1 proportion) into three categories.
# Fix: the original passed windSpeed to cut() by mistake; windSpeed lies far
# outside the 0-1 breaks, so nearly every row became NA and only ~1,014
# observations survived into the dummy-variable model (see "95438 observations
# deleted due to missingness" in the later summary output).
weatherHistoryDummy$humidityScale <- cut(weatherHistoryDummy$humidity, 
                                     breaks=c(0, 0.25, 0.6, 1), 
                                     labels=c("dry", "comfortable", "humid"))
levels(weatherHistoryDummy$humidityScale)
## [1] "dry"         "comfortable" "humid"
table(weatherHistoryDummy$humidityScale)
## 
##         dry comfortable       humid 
##         293         453         268

So we have three possible values: dry, comfortable and humid.

Mapping them in our dataset.

# One 0/1 indicator column per humidityScale category.
# Note: rows where humidityScale is NA propagate NA through ifelse(), and such
# rows are later dropped by lm() (listwise deletion).
weatherHistoryDummy$dry <- ifelse(weatherHistoryDummy$humidityScale == "dry", 1, 0)
weatherHistoryDummy$comfortable <- ifelse(weatherHistoryDummy$humidityScale == "comfortable", 1, 0)
weatherHistoryDummy$humid <- ifelse(weatherHistoryDummy$humidityScale == "humid", 1, 0)

table(weatherHistoryDummy[, c("humidityScale")])
## 
##         dry comfortable       humid 
##         293         453         268
table(weatherHistoryDummy[, c("dry")])
## 
##   0   1 
## 721 293
table(weatherHistoryDummy[, c("comfortable")])
## 
##   0   1 
## 561 453
table(weatherHistoryDummy[, c("humid")])
## 
##   0   1 
## 746 268

Let’s consider the humid dummy variable and how it influences the pressure.

lmTempPresWithDummy <- lm(pressure ~ temperature + humid, data = weatherHistoryDummy)

e) Report your findings.

summary(lmTempPresWithDummy)
## 
## Call:
## lm(formula = pressure ~ temperature + humid, data = weatherHistoryDummy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1015.12     3.20     8.10    14.09    33.55 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1010.01243    4.91890 205.333   <2e-16 ***
## temperature   -0.08503    0.30486  -0.279    0.780    
## humid          6.05245    6.84406   0.884    0.377    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.1 on 1011 degrees of freedom
##   (95438 observations deleted due to missingness)
## Multiple R-squared:  0.0008502,  Adjusted R-squared:  -0.001126 
## F-statistic: 0.4302 on 2 and 1011 DF,  p-value: 0.6505
lm.beta(lmTempPresWithDummy)  
## 
## Call:
## lm(formula = pressure ~ temperature + humid, data = weatherHistoryDummy)
## 
## Standardized Coefficients::
##  (Intercept)  temperature        humid 
##           NA -0.008768578  0.027800804
suppressWarnings(stargazer(lmTempPresWithDummy, type="text"))
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                              pressure          
## -----------------------------------------------
## temperature                   -0.085           
##                               (0.305)          
##                                                
## humid                          6.052           
##                               (6.844)          
##                                                
## Constant                   1,010.012***        
##                               (4.919)          
##                                                
## -----------------------------------------------
## Observations                   1,014           
## R2                             0.001           
## Adjusted R2                   -0.001           
## Residual Std. Error     96.102 (df = 1011)     
## F Statistic            0.430 (df = 2; 1011)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

If we consider humidity as a dummy variable we get the model equation: \[ pressure = 1010.012 - 0.085 \times temperature + 6.052 \times humid \]

plot(lmTempPresWithDummy)

Conclusions:

  • The pressure is 1010.012 when the temperature and humidity is 0;
  • The pressure drops 0.085 per degree increase in temperature;
  • When the weather is humid the pressure increases 6.052;
  • R-squared is less than 0.001, indicating that the model explains less than 1% of the variance of the pressure;
  • A negative adjusted R-squared suggests that the model fits the data worse than just the mean of the dependent variable.
  • F-statistic is 0.4302 with a p-value of 0.6505, indicating that the model is not statistically significant.

Question 2 - Multiple Linear Regression [6 marks]

Consider the dataset “weatherhistory.csv”.

  1. Explore the relationship between pressure and windspeed;
  2. Build a linear model considering (windspeed, humidity, temperature) and pressure;
  3. Assess how model meets key assumptions of linear regression;
  4. Investigate a differential effect by adding dummy variable;
  5. Investigate interaction between effect for windspeed and dummy variable;
  6. Report your findings.

Answer

a) Explore the relationship between pressure and windspeed

First we create a scatterplot to visualize the correlation between pressure and windspeed.

# Scatterplot of wind speed vs pressure on the raw data (includes zero-pressure rows)
ggplot(weatherHistory, aes(x = windSpeed, y = pressure)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue", formula="y ~ x") +
  labs(title = "Pressure vs Wind Speed",
       x = "Wind Speed",
       y = "Pressure") +
  theme_minimal()

We have a lot of points with pressure equals 0. So let’s remove the outliers.

# Same scatterplot on the cleaned data (zero-pressure outliers removed)
ggplot(weatherHistoryWithPressure, aes(x = windSpeed, y = pressure)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue", formula="y ~ x") +
  labs(title = "Pressure vs Wind Speed",
       x = "Wind Speed",
       y = "Pressure") +
  theme_minimal()

Let’s also do a Pearson correlation test.

cor.test(weatherHistoryWithPressure$pressure, weatherHistoryWithPressure$windSpeed)
## 
##  Pearson's product-moment correlation
## 
## data:  weatherHistoryWithPressure$pressure and weatherHistoryWithPressure$windSpeed
## t = -80.909, df = 95162, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2596340 -0.2477448
## sample estimates:
##       cor 
## -0.253699

Considerations;

  • The regression line, shown in blue, indicates a negative linear relationship between wind speed and pressure. As wind speed increases, pressure tends to decrease;
  • The data points are concentrated for lower wind speeds suggesting higher variance for higher wind speeds. Also its uncommon wind speeds higher than 40;
  • The negative correlation of -0.25 indicates a weak relationship between both variables;
  • The p-value of < 2.2e-16 at a confidence level of 95% suggests that the correlation is statistically significant.

b) Build a linear model considering (windspeed, humidity, temperature) and pressure

# Multiple regression: pressure on wind speed, humidity and temperature
modelPressure <- lm(pressure ~ windSpeed + humidity +  temperature, data = weatherHistoryWithPressure)
summary(modelPressure)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.153  -3.811   0.382   4.172  24.327 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.037e+03  1.526e-01  6796.3   <2e-16 ***
## windSpeed   -3.771e-01  3.323e-03  -113.5   <2e-16 ***
## humidity    -1.525e+01  1.512e-01  -100.9   <2e-16 ***
## temperature -4.479e-01  3.019e-03  -148.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.779 on 95160 degrees of freedom
## Multiple R-squared:  0.2404, Adjusted R-squared:  0.2404 
## F-statistic: 1.004e+04 on 3 and 95160 DF,  p-value: < 2.2e-16

Standardize the coefficients to get the results expressed in standard deviations.

lm.beta(modelPressure)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
## (Intercept)   windSpeed    humidity temperature 
##          NA  -0.3341233  -0.3835520  -0.5497544
suppressWarnings(stargazer(modelPressure, type="text"))
## 
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                               pressure           
## -------------------------------------------------
## windSpeed                     -0.377***          
##                                (0.003)           
##                                                  
## humidity                     -15.253***          
##                                (0.151)           
##                                                  
## temperature                   -0.448***          
##                                (0.003)           
##                                                  
## Constant                    1,037.444***         
##                                (0.153)           
##                                                  
## -------------------------------------------------
## Observations                   95,164            
## R2                              0.240            
## Adjusted R2                     0.240            
## Residual Std. Error      6.779 (df = 95160)      
## F Statistic         10,037.910*** (df = 3; 95160)
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01

We got the model equation: \[ pressure = 1037.444 - 0.377 \times windSpeed - 15.253 \times humidity - 0.448 \times temperature \] We can conclude the following:

  • Intercept 1037.444: the value expected when all predictors are at zero.
  • All predictors are negative indicating then when they increase the pressure will drop for every unit each (0.377 windSpeed, 15.256 humidity and 0.448 temperature);
  • The humidity is the less influential in the pressure because it ranges between 0 and 1;
  • All the variables have a p-value lower than 0.01, meaning we have a confidence level greater than 99% that they’re statistically significant;
  • The \(R^2\) and \(Ajusted\ R^2\) are 0.24 meaning that this model explain 24% of the pressure variance;
  • We have a residual standard error of approximately 6.779. Considering that the pressure values are above 1000, this is a small error;
  • The F Statistic of 10,037.910 is high and significant (p < 0.01), confirming that the model is statistically significant.

c) Assess how model meets key assumptions of linear regression.

Linear relationship between the response variable and the predictors

# Attach fitted values and residuals to the data for diagnostic plots
linearity <- weatherHistoryWithPressure |> mutate(yhat = fitted(modelPressure), res = residuals(modelPressure))

# Residuals vs fitted values: a flat smooth around 0 would support linearity
ggplot(linearity, aes(x = yhat, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "Fitted vs Residuals",x = "Fitted",y = "Residuals")

# Residuals vs each individual predictor, to see which one drives any curvature
ggplot(linearity, aes(x = windSpeed, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "WindSpeed vs Residuals",x = "WindSpeed",y = "Residuals")

ggplot(linearity, aes(x = humidity, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "humidity vs Residuals",x = "humidity",y = "Residuals")

ggplot(linearity, aes(x = temperature, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "temperature vs Residuals",x = "temperature",y = "Residuals")

We got the following results:

  • The first chart shows the Fitted Values vs Residuals. It shows a curve in the tails (lower and higher pressures), suggesting that the relationship between the predictors and the dependent variable is not linear.

  • The chart for WindSpeed is fairly linear until we reach about 30 km/h, but after that we get a positive curve with increasing variance, suggesting the relation is not linear for higher wind speeds.

  • In the other side we have the humidity. The variance decreases with lower humidity values (<0.25) and after is fairly constant suggesting a linear relationship for humidity >= 0.25.

  • The temperature variable against the residuals shows a slight curve around the 0 line, so we can say that temperature is somewhat linear.

The result is that this model is not a linear model. We can improve it removing the outliers or do some kind of transformation.

Independence of errors

# Residuals in data-collection order (the data is time-ordered), to check for
# dependence between consecutive errors
plot(resid(modelPressure), type = 'l', main = "Residuals Plot",
     xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")

The Residuals Plot displays the residuals from the linear regression model plotted against their index, which represents the order in which data were collected (you can confirm this by the time column, which is in order).

As we can see the model does not presents obvious bias estimating the dependent variable.

We can assume that the model is independent of errors.

Homoscedasticity

The variance of the residuals should remain constant across all levels of the independent variables.

By the previous Residuals Plots we see that they present a funneling shape for all variables suggesting that the model exhibits heteroscedasticity. To confirm we can do Breusch-Pagan test.

bptest(modelPressure)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelPressure
## BP = 6388.8, df = 3, p-value < 2.2e-16

Given the result of the test we can conclude that with a small p-value (2.2e-16) we have strong evidence that the variance of the residuals is not constant.

Normality of Errors

Residuals should be approximately normally distributed. To verify we will do the histogram for the residuals

# Histogram of residuals with a normal density overlay for comparison
ggplot(linearity, aes(x = res)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50) +
  ggtitle("Histogram of Residuals") +
  xlab("Residuals") +
  ylab("Density") +
  stat_function(fun = dnorm, args = list(mean = mean(linearity$res), sd = sd(linearity$res)), color = "blue", linewidth = 1) +
  theme_minimal()

# Q-Q plot of the residuals against a theoretical normal distribution
qqnorm(linearity$res, main = "Q-Q Plot of Residuals")
qqline(linearity$res, col = "red")

ad.test(linearity$res)
## 
##  Anderson-Darling normality test
## 
## data:  linearity$res
## A = 331.88, p-value < 2.2e-16

By the qqplot and the result of the Anderson-Darling normality test we can conclude that the residuals do not follow a normal distribution.

ad.test(linearity$res)
## 
##  Anderson-Darling normality test
## 
## data:  linearity$res
## A = 331.88, p-value < 2.2e-16

No Multicollinearity among Predictors

The VIF (Variance Inflation Factor) measures how much the variance of an estimated regression coefficient increases when the predictors are correlated.

The VIF results can be interpreted per variable:

  • VIF = 1: there is no correlation between a given predictor and the others;
  • 1 < VIF <= 5: moderate, but not severe, correlation;

  • VIF > 5: highly correlated.

vif(modelPressure)
##   windSpeed    humidity temperature 
##    1.086068    1.811151    1.720221
vif(modelPressure)
##   windSpeed    humidity temperature 
##    1.086068    1.811151    1.720221

Based on our results, our predictors are only lightly correlated with one another, not affecting our predictive model. In other words: each coefficient produced by the regression model is likely reliable.

No Perfect Multicollinearity

cor(weatherHistoryWithPressure[c("windSpeed", "humidity", "temperature")])
##               windSpeed   humidity temperature
## windSpeed    1.00000000 -0.2242867  0.01018877
## humidity    -0.22428667  1.0000000 -0.63277644
## temperature  0.01018877 -0.6327764  1.00000000

Looking at the correlation matrix we only have one moderate negative correlation, that is humidity with temperature (-0.633). This value is significant but not indicative of high multicollinearity.

So we can conclude that there is no Perfect Multicollinearity in our model.

d) Investigate a differential effect by adding dummy variable

First let’s create a dummy variable: a variable indicating whether it is raining or not.

# 0/1 dummy: 1 when the precipitation type is rain, 0 otherwise
weatherHistoryWithPressure$rain <- ifelse(weatherHistoryWithPressure$precipType == "rain", 1, 0)
table(weatherHistoryWithPressure$rain)
## 
##     0     1 
## 11049 84115

Next we fit both models. Results will be discussed later.

# Baseline model without the dummy, refit for side-by-side comparison
modelPressureSimple <- lm(pressure ~ windSpeed + humidity +  temperature , data = weatherHistoryWithPressure)
lm.beta(modelPressureSimple)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
## (Intercept)   windSpeed    humidity temperature 
##          NA  -0.3341233  -0.3835520  -0.5497544
# Extended model: add the rain dummy as an additional predictor
modelPressureDummy <- lm(pressure ~ windSpeed + humidity +  temperature + rain , data = weatherHistoryWithPressure)
lm.beta(modelPressureDummy)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature + 
##     rain, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
## (Intercept)   windSpeed    humidity temperature        rain 
##          NA -0.32379183 -0.36143180 -0.48525197 -0.09145702
summary(modelPressureSimple)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.153  -3.811   0.382   4.172  24.327 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.037e+03  1.526e-01  6796.3   <2e-16 ***
## windSpeed   -3.771e-01  3.323e-03  -113.5   <2e-16 ***
## humidity    -1.525e+01  1.512e-01  -100.9   <2e-16 ***
## temperature -4.479e-01  3.019e-03  -148.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.779 on 95160 degrees of freedom
## Multiple R-squared:  0.2404, Adjusted R-squared:  0.2404 
## F-statistic: 1.004e+04 on 3 and 95160 DF,  p-value: < 2.2e-16
summary(modelPressureDummy)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature + 
##     rain, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.619  -3.795   0.391   4.131  24.111 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1038.00663    0.15359 6758.49   <2e-16 ***
## windSpeed     -0.36543    0.00334 -109.40   <2e-16 ***
## humidity     -14.37376    0.15432  -93.14   <2e-16 ***
## temperature   -0.39539    0.00361 -109.54   <2e-16 ***
## rain          -2.22065    0.08428  -26.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.755 on 95159 degrees of freedom
## Multiple R-squared:  0.2459, Adjusted R-squared:  0.2459 
## F-statistic:  7757 on 4 and 95159 DF,  p-value: < 2.2e-16

e) Investigate an interaction effect for windspeed and dummy variable

Adding the interaction, we include the term windSpeed:rain to understand how pressure correlates with windSpeed on rainy versus non-rainy days. Conclusions will be drawn later.

# Interaction model: the windSpeed:rain term lets the windSpeed slope differ
# between rainy (rain = 1) and non-rainy (rain = 0) observations
modelPressureDummyInteraction <- lm(pressure ~ windSpeed + humidity +  temperature + rain  + windSpeed:rain, data = weatherHistoryWithPressure)
lm.beta(modelPressureDummyInteraction)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature + 
##     rain + windSpeed:rain, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
##    (Intercept)      windSpeed       humidity    temperature           rain 
##             NA     -0.4625417     -0.3587242     -0.4807740     -0.1632443 
## windSpeed:rain 
##      0.1668073
# Full summary of the interaction model.
summary(modelPressureDummyInteraction)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature + 
##     rain + windSpeed:rain, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.449  -3.773   0.408   4.125  25.286 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.039e+03  1.750e-01 5938.20   <2e-16 ***
## windSpeed      -5.220e-01  9.949e-03  -52.47   <2e-16 ***
## humidity       -1.427e+01  1.542e-01  -92.50   <2e-16 ***
## temperature    -3.917e-01  3.611e-03 -108.49   <2e-16 ***
## rain           -3.964e+00  1.340e-01  -29.57   <2e-16 ***
## windSpeed:rain  1.754e-01  1.050e-02   16.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.745 on 95158 degrees of freedom
## Multiple R-squared:  0.2481, Adjusted R-squared:  0.2481 
## F-statistic:  6279 on 5 and 95158 DF,  p-value: < 2.2e-16

f) Report your findings

# Side-by-side text table comparing the three nested pressure models.
# suppressWarnings() silences stargazer's compatibility warnings only.
suppressWarnings(stargazer(modelPressureSimple, modelPressureDummy, modelPressureDummyInteraction, type="text",
                           title= "Regression Results", align = TRUE,  # TRUE, not T: T can be reassigned
                           column.labels = c("Pressure", "Pressure with Dummy", "Pressure with Dummy and Interaction")))
## 
## Regression Results
## ==================================================================================================================
##                                                          Dependent variable:                                      
##                     ----------------------------------------------------------------------------------------------
##                                                                pressure                                           
##                               Pressure                Pressure with Dummy      Pressure with Dummy and Interaction
##                                  (1)                          (2)                              (3)                
## ------------------------------------------------------------------------------------------------------------------
## windSpeed                     -0.377***                    -0.365***                        -0.522***             
##                                (0.003)                      (0.003)                          (0.010)              
##                                                                                                                   
## humidity                     -15.253***                    -14.374***                      -14.266***             
##                                (0.151)                      (0.154)                          (0.154)              
##                                                                                                                   
## temperature                   -0.448***                    -0.395***                        -0.392***             
##                                (0.003)                      (0.004)                          (0.004)              
##                                                                                                                   
## rain                                                       -2.221***                        -3.964***             
##                                                             (0.084)                          (0.134)              
##                                                                                                                   
## windSpeed:rain                                                                              0.175***              
##                                                                                              (0.010)              
##                                                                                                                   
## Constant                    1,037.444***                  1,038.007***                    1,039.416***            
##                                (0.153)                      (0.154)                          (0.175)              
##                                                                                                                   
## ------------------------------------------------------------------------------------------------------------------
## Observations                   95,164                        95,164                          95,164               
## R2                              0.240                        0.246                            0.248               
## Adjusted R2                     0.240                        0.246                            0.248               
## Residual Std. Error      6.779 (df = 95160)            6.755 (df = 95159)              6.745 (df = 95158)         
## F Statistic         10,037.910*** (df = 3; 95160) 7,756.866*** (df = 4; 95159)    6,279.450*** (df = 5; 95158)    
## ==================================================================================================================
## Note:                                                                                  *p<0.1; **p<0.05; ***p<0.01

Comparing the both models we find the following conclusions:

  • The dummy variable rain and its interaction with windSpeed increase the explanatory power of the model for pressure.
  • The addition of rain and its interaction with windSpeed increases the explained variance by about 0.8 percentage points. We can tell that the effect of windSpeed on pressure differs between rainy and non-rainy days;
  • The residuals are slightly lower in the latest model than in the first model without the dummy variable, meaning its inclusion made the predictions more accurate.

Question 3 - Logistic regression [4 marks]

Consider the dataset “heartfailure.csv”.

  1. Build a model considering diabetes as predictor;
  2. Calculate and analyze odds ratio of the model;
  3. Extend the model by considering variable age. (Convert the age into categorical data, if age < 55 , category 1; age is between 55 and 68, category 2; otherwise category 3 );
  4. Report your findings;

Answer

a) Build a model considering diabetes as predictor

Load the dataset and analyze the structure.

# Load the heart-failure dataset and inspect its structure (299 obs, 13 vars).
heartFailure <- read.csv("heartfailure.csv")
str(heartFailure)
## 'data.frame':    299 obs. of  13 variables:
##  $ age                     : num  75 55 65 50 65 90 75 60 65 80 ...
##  $ anaemia                 : int  0 0 0 1 1 1 1 1 0 1 ...
##  $ creatinine_phosphokinase: int  582 7861 146 111 160 47 246 315 157 123 ...
##  $ diabetes                : int  0 0 0 0 1 0 0 1 0 0 ...
##  $ ejection_fraction       : int  20 38 20 20 20 40 15 60 65 35 ...
##  $ high_blood_pressure     : int  1 0 0 0 0 1 0 0 0 1 ...
##  $ platelets               : num  265000 263358 162000 210000 327000 ...
##  $ serum_creatinine        : num  1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ serum_sodium            : int  130 136 129 137 116 132 137 131 138 133 ...
##  $ sex                     : int  1 1 1 1 0 1 1 1 0 1 ...
##  $ smoking                 : int  0 0 1 0 0 1 0 1 0 1 ...
##  $ time                    : int  4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT             : int  1 1 1 1 1 1 1 1 1 1 ...

Both the diabetes predictor and DEATH_EVENT are binary variables. In the structure they are represented as int, so they need to be converted to factors.

# diabetes and DEATH_EVENT are 0/1 indicators stored as int; recode them as
# factors so glm() treats them as categorical (level "0" becomes the reference).
heartFailure$diabetes <- factor(heartFailure$diabetes, levels = c(0, 1))
heartFailure$DEATH_EVENT <- factor(heartFailure$DEATH_EVENT, levels = c(0, 1))
str(heartFailure)
## 'data.frame':    299 obs. of  13 variables:
##  $ age                     : num  75 55 65 50 65 90 75 60 65 80 ...
##  $ anaemia                 : int  0 0 0 1 1 1 1 1 0 1 ...
##  $ creatinine_phosphokinase: int  582 7861 146 111 160 47 246 315 157 123 ...
##  $ diabetes                : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
##  $ ejection_fraction       : int  20 38 20 20 20 40 15 60 65 35 ...
##  $ high_blood_pressure     : int  1 0 0 0 0 1 0 0 0 1 ...
##  $ platelets               : num  265000 263358 162000 210000 327000 ...
##  $ serum_creatinine        : num  1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ serum_sodium            : int  130 136 129 137 116 132 137 131 138 133 ...
##  $ sex                     : int  1 1 1 1 0 1 1 1 0 1 ...
##  $ smoking                 : int  0 0 1 0 0 1 0 1 0 1 ...
##  $ time                    : int  4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT             : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

Describing both variables we got:

# Frequency of each diabetes level: 174 non-diabetic vs 125 diabetic patients.
table(heartFailure$diabetes)
## 
##   0   1 
## 174 125
# Outcome distribution: 203 survivors vs 96 deaths.
table(heartFailure$DEATH_EVENT)
## 
##   0   1 
## 203  96
# Stacked 100% bar chart: proportion of deaths within each diabetes group.
# position = "fill" rescales each bar to proportions rather than raw counts.
ggplot(heartFailure, aes(x = factor(diabetes), fill = factor(DEATH_EVENT))) +
  geom_bar(position = "fill") +
  scale_x_discrete(labels = c("No", "Yes")) +
  scale_fill_manual(values = c("darkgreen", "darkred"), labels = c("Survived", "Died")) +
  labs(title = "Proportion of Deaths by Diabetes",
       x = "Diabetes", y = "Proportion", fill = "Outcome")

From the proportion chart we obtain two insights:

  • Both groups have more survivors than deaths.
  • The proportions of deaths among diabetics and non-diabetics are similar, suggesting that diabetes is not a good predictor of the outcome.

Modelling the outcome DEATH_EVENT as a function of diabetes:

# Logistic regression of death on diabetes alone (binomial family, logit link).
modelDiabetes <- glm(DEATH_EVENT ~ diabetes, data = heartFailure, family = binomial())

summary(modelDiabetes)
## 
## Call:
## glm(formula = DEATH_EVENT ~ diabetes, family = binomial(), data = heartFailure)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.745333   0.162270  -4.593 4.37e-06 ***
## diabetes1   -0.008439   0.251190  -0.034    0.973    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 375.35  on 298  degrees of freedom
## Residual deviance: 375.35  on 297  degrees of freedom
## AIC: 379.35
## 
## Number of Fisher Scoring iterations: 4
# Compact stargazer view of the same logistic model.
suppressWarnings(stargazer(modelDiabetes, type="text"))
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                           DEATH_EVENT        
## ---------------------------------------------
## diabetes1                   -0.008           
##                             (0.251)          
##                                              
## Constant                   -0.745***         
##                             (0.162)          
##                                              
## ---------------------------------------------
## Observations                  299            
## Log Likelihood             -187.674          
## Akaike Inf. Crit.           379.348          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01

So we got the following model equation:

\[ p(x) = \frac{1}{1 + e^{-(intercept + b_{diabetes} \times diabetes)}} = \frac{1}{1 + e^{-(-0.745333 - 0.008439 \times diabetes)}} = \frac{1}{1 + e^{(0.745333 + 0.008439 \times diabetes)}} \]

Because diabetes is a binary variable we have 2 possible values:

\[ p(0) = \frac{1}{1 + e^{0.745}} \\ p(1) = \frac{1}{1 + e^{(0.745 + 0.008)}} \]

We evaluate the model using 3 tests: - Likelihood Ratio Test: measures how much explanatory power a variable adds to the model; good for comparing how well the model fits relative to models with other variables; - Confusion Matrix: good for measuring performance and accuracy; - ROC and AUC curve: the ROC curve visually shows the trade-off between sensitivity and specificity for every possible cut-off, and the AUC value quantifies the overall ability of the test to discriminate between individuals having the outcome and those not having it.

Likelihood ratio test

# Likelihood ratio test against the intercept-only model: does adding
# diabetes improve the fit?
lrtest(modelDiabetes)
## Likelihood ratio test
## 
## Model 1: DEATH_EVENT ~ diabetes
## Model 2: DEATH_EVENT ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   2 -187.67                     
## 2   1 -187.67 -1 0.0011     0.9732
# NOTE(review): this repeats the lrtest() call above verbatim — presumably an
# accidental duplication left in when the document was knitted.
lrtest(modelDiabetes)
## Likelihood ratio test
## 
## Model 1: DEATH_EVENT ~ diabetes
## Model 2: DEATH_EVENT ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   2 -187.67                     
## 2   1 -187.67 -1 0.0011     0.9732
# Classify with a 0.5 probability cut-off and build a confusion matrix.
predicted_probs <- predict(modelDiabetes, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
actual_classes <- heartFailure$DEATH_EVENT

# positive = "1" makes the death event the "positive" class for sensitivity etc.
confusion_matrix <- suppressWarnings(caret::confusionMatrix(factor(predicted_classes), factor(actual_classes), positive = "1"))
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 203  96
##          1   0   0
##                                           
##                Accuracy : 0.6789          
##                  95% CI : (0.6228, 0.7315)
##     No Information Rate : 0.6789          
##     P-Value [Acc > NIR] : 0.5276          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.6789          
##              Prevalence : 0.3211          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve of the diabetes-only model, with the AUC printed on the plot.
roc_curve <- roc(heartFailure$DEATH_EVENT, fitted(modelDiabetes))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve, main="ROC Curve for Diabetes Model", col="blue", print.auc = TRUE)  # TRUE, not T: T can be reassigned

Conclusions:

  • Intercept of -0.745 with standard error of 0.162. This is the log-odds of death in the absence of diabetes; the negative sign means death is less likely than survival in that group. The p-value of 4.37e-06 means this estimate is statistically significant at the 99.9% confidence level;
  • Diabetes coefficient of -0.008439 with a standard error of 0.251; since the p-value of 0.973 is close to 1 (much greater than 0.05), we can conclude that there is no statistically significant evidence that diabetes has an effect on the probability of death;
  • The AUC is approximately 0.5 — no better than random guessing — meaning that diabetes is not useful for predicting heart failure;
  • The confusion matrix shows that the model fails to identify true positives (sensitivity = 0) suggesting that diabetes is not a good indicator;
  • The Likelihood ratio shows no improvement adding the diabetes variable to the model (no change compared with the reference value).

b) Calculate and analyze odds ratio of the model

# Odds ratio for diabetes: exponentiate the log-odds coefficient.
coefDiabetes <- coef(modelDiabetes)["diabetes1"]
exp(coefDiabetes)
## diabetes1 
## 0.9915966

To calculate the odds ratio, we exponentiate the coefficient of diabetes. This coefficient represents the log-odds of diabetes influencing the outcome. The odds ratio itself quantifies the strength of the association between diabetes and the occurrence of a DEATH_EVENT.

Because the Odds Ratio is close to 1 (\(OR_{diabetes} \approx 1\)) this means the diabetes has little to no effect predicting DEATH_EVENT.

This is confirmed by the p-value of 0.973 (Wald test), which is not statistically significant.

c) Extend the model by considering variable age.

(Convert the age into categorical data, 
- if age < 55 , category 1; 
- age is between 55 and 68 category 2 ; 
- otherwise category 3)

First we will create a new categorical variable and add it to our dataset.

# Bin age into three categories: [<55) = 1, [55, 68) = 2, [68+) = 3.
# right = FALSE makes the intervals left-closed, so age 55 falls in
# category 2 and age 68 in category 3.
heartFailure$AGE_CAT <- cut(heartFailure$age, 
                            breaks = c(-Inf, 55, 68, Inf), 
                            labels = c("1", "2", "3"),
                            right = FALSE)

# Proportion of deaths within each age band (100% stacked bars).
ggplot(heartFailure, aes(x = factor(AGE_CAT), fill = factor(DEATH_EVENT))) +
  geom_bar(position = "fill") +
  scale_x_discrete(labels = c("0-54", "55-67", "68+")) +
  scale_fill_manual(values = c("darkgreen", "darkred"), labels = c("Survived", "Died")) +
  labs(title = "Proportion of Deaths by Age",
       x = "Age Category", y = "Proportion", fill = "Outcome")  # was "Diabetes": copy-paste bug

Extend the model

# Extended logistic model: diabetes plus the age category (AGE_CAT "1" = reference).
modelDeathDiabetesAge <- glm(DEATH_EVENT ~ diabetes + AGE_CAT, family = binomial(), data = heartFailure) 
summary(modelDeathDiabetesAge)
## 
## Call:
## glm(formula = DEATH_EVENT ~ diabetes + AGE_CAT, family = binomial(), 
##     data = heartFailure)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.2695     0.2709  -4.686 2.79e-06 ***
## diabetes1     0.1586     0.2633   0.602 0.547029    
## AGE_CAT2      0.1893     0.3198   0.592 0.553987    
## AGE_CAT3      1.1993     0.3288   3.648 0.000264 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 375.35  on 298  degrees of freedom
## Residual deviance: 358.84  on 295  degrees of freedom
## AIC: 366.84
## 
## Number of Fisher Scoring iterations: 4
# Compact stargazer view of the extended model.
suppressWarnings(stargazer(modelDeathDiabetesAge, type="text"))
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                           DEATH_EVENT        
## ---------------------------------------------
## diabetes1                    0.159           
##                             (0.263)          
##                                              
## AGE_CAT2                     0.189           
##                             (0.320)          
##                                              
## AGE_CAT3                   1.199***          
##                             (0.329)          
##                                              
## Constant                   -1.270***         
##                             (0.271)          
##                                              
## ---------------------------------------------
## Observations                  299            
## Log Likelihood             -179.420          
## Akaike Inf. Crit.           366.841          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01

Likelihood ratio test

# LR test of the diabetes + age model against the intercept-only model.
lrtest(modelDeathDiabetesAge)
## Likelihood ratio test
## 
## Model 1: DEATH_EVENT ~ diabetes + AGE_CAT
## Model 2: DEATH_EVENT ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   4 -179.42                         
## 2   1 -187.67 -3 16.508  0.0008919 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Confusion Matrix:

# Confusion matrix for the extended model, with the same 0.5 cut-off
# and the death event as the "positive" class.
predicted_probs_wage <- predict(modelDeathDiabetesAge, type = "response")
predicted_classes_wage <- ifelse(predicted_probs_wage > 0.5, 1, 0)
actual_classes_wage <- heartFailure$DEATH_EVENT

confusion_matrix_wage <- suppressWarnings(caret::confusionMatrix(factor(predicted_classes_wage), factor(actual_classes_wage), positive = "1"))
print(confusion_matrix_wage)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 190  84
##          1  13  12
##                                           
##                Accuracy : 0.6756          
##                  95% CI : (0.6193, 0.7283)
##     No Information Rate : 0.6789          
##     P-Value [Acc > NIR] : 0.5765          
##                                           
##                   Kappa : 0.0757          
##                                           
##  Mcnemar's Test P-Value : 1.182e-12       
##                                           
##             Sensitivity : 0.12500         
##             Specificity : 0.93596         
##          Pos Pred Value : 0.48000         
##          Neg Pred Value : 0.69343         
##              Prevalence : 0.32107         
##          Detection Rate : 0.04013         
##    Detection Prevalence : 0.08361         
##       Balanced Accuracy : 0.53048         
##                                           
##        'Positive' Class : 1               
## 

And the ROC Curve:

# ROC curve for the extended (diabetes + age) model.
roc_curve_wage <- roc(heartFailure$DEATH_EVENT, fitted(modelDeathDiabetesAge))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_wage, main="ROC Curve for Diabetes + Age Model", col="blue", print.auc = TRUE)  # TRUE, not T: T can be reassigned

In the model we consider the reference level to be \(diabetes = 0\) and \(AGE\_CAT = 1\):

\[ p(x) = \frac{1}{1 + e^{-(\beta_{intercept} + \beta_{diabetes} \times diabetes + \beta_{AGE\_CAT2} \times AGE\_CAT2 + \beta_{AGE\_CAT3} \times AGE\_CAT3)}} \] \[ p(x) = \frac{1}{1 + e^{-(−1.2695 + 0.1586 \times diabetes + 0.1893 \times AGE\_CAT2 + 1.1993 \times AGE\_CAT3)}} \] \[ p(x) = \frac{1}{1 + e^{(1.2695 - 0.1586 \times diabetes - 0.1893 \times AGE\_CAT2 - 1.1993 \times AGE\_CAT3)}} \]

We can use the equivalent log form of this equation: \[ log\left(\frac{p}{1 - p}\right) = \beta_{intercept} + \beta_{diabetes} \times diabetes + \beta_{AGE\_CAT2} \times AGE\_CAT2 + \beta_{AGE\_CAT3} \times AGE\_CAT3 \] \[ log\left(\frac{p}{1 - p}\right) = −1.2695 + 0.1586 \times diabetes + 0.1893 \times AGE\_CAT2 + 1.1993 \times AGE\_CAT3 \]

Conclusions:

  • As we saw in the proportion chart, almost 50% of patients in Category 3 (age 68+) died. Age looks like a good predictor for our model;
  • The intercept coefficient for \(diabetes = 0\) and \(AGE\_CAT = 1\) is -1.2695, with a p-value of \(2.79e-6\), which indicates high statistical significance;
  • The p-value of diabetes is \(0.547029\), so diabetes is not statistically significant; its effect on the log-odds is minimal (\(\beta_{diabetes} = 0.1586\));
  • The same can be said for AGE_CAT2, which has a p-value of \(0.553987\); its increase in the log-odds of DEATH_EVENT is negligible (\(\beta_{AGE\_CAT2} = 0.1893\));
  • AGE_CAT3 (68 years and older) shows a substantial increase in the log odds of a DEATH_EVENT: it has \(\beta_{AGE\_CAT3} = 1.1993\) and a p-value of \(p = 0.000264\), which makes it statistically significant. This finding is reasonable because the chance of dying increases with age.
  • Likelihood ratio test shows that including diabetes and AGE_CAT in the model significantly improves the model’s explanatory power compared to just using the intercept. We know that we are considering only the AGE_CAT because we saw in the earlier that diabetes didn’t improve the explainability of the model;
  • In the confusion matrix we can see a large number of False Negatives (84), which limits the accuracy to 67.56%. The p-value of 0.5765 indicates that the accuracy is not significantly better than the no-information rate.
  • The sensitivity of 12.5% is quite low, but still better than that of the model without age.

d) Report your finding

So let’s compare both models

# Side-by-side comparison of the two logistic models in one text table.
suppressWarnings(stargazer(modelDiabetes, modelDeathDiabetesAge, 
                           type="text", 
                           title= "Regression Results",
                           column.labels = c("Diabetes", "Diabetes And Age"), 
                           align = TRUE))  # TRUE, not T: T can be reassigned
## 
## Regression Results
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                           DEATH_EVENT         
##                    Diabetes   Diabetes And Age
##                       (1)           (2)       
## ----------------------------------------------
## diabetes1           -0.008         0.159      
##                     (0.251)       (0.263)     
##                                               
## AGE_CAT2                           0.189      
##                                   (0.320)     
##                                               
## AGE_CAT3                          1.199***    
##                                   (0.329)     
##                                               
## Constant           -0.745***     -1.270***    
##                     (0.162)       (0.271)     
##                                               
## ----------------------------------------------
## Observations          299           299       
## Log Likelihood     -187.674       -179.420    
## Akaike Inf. Crit.   379.348       366.841     
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01
# Plot both ROC curves side by side for visual comparison.
# NOTE(review): par(mfrow=...) is changed without saving/restoring the old
# settings (e.g. old <- par(mfrow = c(1, 2)); on.exit(par(old))) — subsequent
# plots in the session will also be drawn in a 1x2 layout.
par(mfrow=c(1,2))
plot(roc_curve, main="ROC Diabetes", col="blue", print.auc = TRUE)       # TRUE, not T
plot(roc_curve_wage, main="ROC Diabetes + Age", col="blue", print.auc = TRUE)

Based on our previous findings and looking at the ROC charts, we can say that the first model is no better than randomly guessing who will die, so the diabetes predictor performs the same as the reference model. When we add the age variable, the 68+ category becomes significant and the sensitivity rises to 12.5%, which makes age a good predictor — although it needs to be paired with other predictors to further increase the model's explanatory power.

We also see an improvement in the second model, with an increase in the log-likelihood (from -187.674 to -179.420) and a reduction of the AIC (from 379.348 to 366.841).